导入数据文件'Sample - Superstore.xls',绘制水平柱状图,展示每个商品子类别(Sub-Category)的利润(Profit),并根据利润大小显示颜色。
# Step1. 导入数据文件'Sample - Superstore.xls',并查看数据
import pandas as pd
df = pd.read_excel('./Sample - Superstore.xls',sheet_name='Orders')
df.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
# Step2. 数据分析:对Sub-Category进行分组,再对每组的Profit求和(数据聚合与分组操作——GroupBy机制)
data = df.groupby('Sub-Category')['Profit'].sum()
# df['Profit'].groupby(df['Sub-Category']).sum() # 另一种写法
print(data) # 返回Series:左侧是索引index,右侧是值values
print('\n',data.index) # 使用.index属性查看索引
print('\n',data.values) # 使用.values属性或者to_numpy()方法查看值(返回数组)
Sub-Category
Accessories 41936.6357
Appliances 18138.0054
Art 6527.7870
Binders 30221.7633
Bookcases -3472.5560
Chairs 26590.1663
Copiers 55617.8249
Envelopes 6964.1767
Fasteners 949.5182
Furnishings 13059.1436
Labels 5546.2540
Machines 3384.7569
Paper 34053.5693
Phones 44515.7306
Storage 21278.8264
Supplies -1189.0995
Tables -17725.4811
Name: Profit, dtype: float64
Index(['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases', 'Chairs',
'Copiers', 'Envelopes', 'Fasteners', 'Furnishings', 'Labels',
'Machines', 'Paper', 'Phones', 'Storage', 'Supplies', 'Tables'],
dtype='object', name='Sub-Category')
[ 41936.6357 18138.0054 6527.787 30221.7633 -3472.556 26590.1663
55617.8249 6964.1767 949.5182 13059.1436 5546.254 3384.7569
34053.5693 44515.7306 21278.8264 -1189.0995 -17725.4811]
# Step3. 绘制水平柱状图
import plotly.graph_objects as go
fig = go.Figure(data=go.Bar(
y=data.index,
x=data.values,
marker=dict(
color=data.values,
colorscale=[[0,'#fb8c00'],[0.25,'white'],[1,'#1565c0']],
showscale=True),
orientation='h'
))
fig.update_layout(
title='Profit by Sub-Category',
plot_bgcolor='white',
xaxis=dict(title='Profit',gridcolor='rgba(0,0,0,0.05)'),
yaxis=dict(title='Sub-Category')
)
fig.show()
# 思考:如何设置一个利润阈值?柱形的颜色由是否超过阈值来决定
thred = 25000
colors=['#BAB0AC']*data.shape[0]
for i in range(data.shape[0]):
if data[i]>thred:
colors[i]='#E15759'
fig = go.Figure(data=go.Bar(
y=data.index,
x=data.values,
marker=dict(color=colors),
orientation='h'
))
fig.update_layout(
title='Profit by Sub-Category',
plot_bgcolor='white',
xaxis=dict(title='Profit',gridcolor='rgba(0,0,0,0.05)'),
yaxis=dict(title='Sub-Category'),
)
fig.show()
导入数据文件'Sample - Superstore.xls',使用堆积柱状图展示每个商品子类别(Sub-Category)中,不同细分客户(Segment)的利润(Profit)。
# Step1. 导入数据文件'Sample - Superstore.xls',并查看数据
import pandas as pd
df = pd.read_excel('./Sample - Superstore.xls',sheet_name='Orders')
df.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
# Step2. 数据分析:从绘图角度来思考,一个图形fig中有三个trace(Segment),每个trace对应多个柱形(Sub-Category)
# 对Segment和Sub-Category两个指标进行分组,再对每组的Profit求和(数据聚合与分组操作——GroupBy机制)
data = df.groupby(['Segment','Sub-Category'])['Profit'].sum()
# df['Profit'].groupby([df['Segment'],df['Sub-Category']]).sum() # 另一种写法
data
Segment Sub-Category
Consumer Accessories 20735.9225
Appliances 6981.9282
Art 3454.3011
Binders 17995.5972
Bookcases -4435.6382
Chairs 13235.3319
Copiers 24083.7106
Envelopes 3264.4126
Fasteners 576.8008
Furnishings 7919.4227
Labels 3075.9884
Machines 2141.0618
Paper 15534.6436
Phones 23837.1147
Storage 7104.2004
Supplies -1657.5513
Tables -9728.0378
Corporate Accessories 12707.4805
Appliances 7429.8952
Art 2004.6477
Binders 6377.3201
Bookcases 638.4502
Chairs 8344.6565
Copiers 18990.2789
Envelopes 2571.2290
Fasteners 251.9030
Furnishings 3508.2077
Labels 1760.8273
Machines 703.0190
Paper 10361.5468
Phones 11766.2196
Storage 9131.0247
Supplies 338.9264
Tables -4906.4986
Home Office Accessories 8493.2327
Appliances 3726.1820
Art 1068.8382
Binders 5848.8460
Bookcases 324.6320
Chairs 5010.1779
Copiers 12543.8354
Envelopes 1128.5351
Fasteners 120.8144
Furnishings 1631.5132
Labels 709.4383
Machines 540.6761
Paper 8157.3789
Phones 8912.3963
Storage 5043.6013
Supplies 129.5254
Tables -3090.9447
Name: Profit, dtype: float64
# 得到的结果是一个MultiIndex(多重索引)的Series:左侧是两层索引index,右侧是值values
# data.index # 使用index属性查看多重索引
print(data.index.levels[0]) # 查看外层索引
print(data.index.levels[1]) # 查看内层索引
print(data['Consumer']) # 指定任一外层索引,查看内层Series
Index(['Consumer', 'Corporate', 'Home Office'], dtype='object', name='Segment')
Index(['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases', 'Chairs',
'Copiers', 'Envelopes', 'Fasteners', 'Furnishings', 'Labels',
'Machines', 'Paper', 'Phones', 'Storage', 'Supplies', 'Tables'],
dtype='object', name='Sub-Category')
Sub-Category
Accessories 20735.9225
Appliances 6981.9282
Art 3454.3011
Binders 17995.5972
Bookcases -4435.6382
Chairs 13235.3319
Copiers 24083.7106
Envelopes 3264.4126
Fasteners 576.8008
Furnishings 7919.4227
Labels 3075.9884
Machines 2141.0618
Paper 15534.6436
Phones 23837.1147
Storage 7104.2004
Supplies -1657.5513
Tables -9728.0378
Name: Profit, dtype: float64
# Step3. 绘制堆积柱状图
import plotly.graph_objects as go
fig=go.Figure()
# 使用for循环来遍历segment,绘制多个trace
for i in data.index.levels[0]:
fig.add_trace(
go.Bar(
x=data[i].index, # 当前segment对应Series的索引
y=data[i].values, # 当前segment对应Series的值
name=i # 当前segment名称
))
fig.update(layout=dict(
title='Profit by Sub-Category & Segment',
yaxis_title='Profit',
barmode='stack'
))
fig.show()
# 思考:如何自定义颜色?
fig=go.Figure()
colors=['#76B7B2','#BAB0AC','#FF9DA7'] # 自定义颜色列表(离散值)
# 使用for循环来遍历segment,绘制多个trace
for i,c in zip(data.index.levels[0],colors): # 使用zip()函数同时遍历两个序列
fig.add_trace(
go.Bar(
x=data[i].index, # 当前segment对应Series的索引
y=data[i].values, # 当前segment对应Series的值
name=i, # 当前segment名称
marker=dict(color=c) # 将离散颜色值传递给color参数
))
fig.update(layout=dict(
title='Profit by Sub-Category & Segment',
barmode='stack',
))
fig.show()
导入数据文件'Sample - Superstore.xls',绘制散点图,展示商品子类别(Sub-Category)中'Paper'销售额(Sales)和利润(Profit)的相关关系,用气泡的颜色来展示Discount的取值大小,从而进一步分析这些变量之间的关系。
# Step1. 导入数据:导入文件'Sample - Superstore.xls',并查看数据
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
df.shape
(9994, 21)
# Step2. 数据筛选:对'Sub-Category'中'Paper'产品进行筛选
data = df[df['Sub-Category']=='Paper'] # 布尔值索引
data.shape
(1370, 21)
data.head() # 查看筛选的结果
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 13 | CA-2018-114412 | 2018-04-15 | 2018-04-20 | Standard Class | AA-10480 | Andrew Allen | Consumer | United States | Concord | ... | 28027.0 | South | OFF-PA-10002365 | Office Supplies | Paper | Xerox 1967 | 15.552 | 3 | 0.2 | 5.4432 |
| 34 | 35 | CA-2018-107727 | 2018-10-19 | 2018-10-23 | Second Class | MA-17560 | Matt Abelman | Home Office | United States | Houston | ... | 77095.0 | Central | OFF-PA-10000249 | Office Supplies | Paper | Easy-staple paper | 29.472 | 3 | 0.2 | 9.9468 |
| 56 | 57 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10001569 | Office Supplies | Paper | Xerox 232 | 32.400 | 5 | 0.0 | 15.5520 |
| 58 | 59 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10000587 | Office Supplies | Paper | Array Parchment Paper, Assorted Colors | 14.560 | 2 | 0.0 | 6.9888 |
| 64 | 65 | CA-2016-135545 | 2016-11-24 | 2016-11-30 | Standard Class | KM-16720 | Kunst Miller | Consumer | United States | Los Angeles | ... | 90004.0 | West | OFF-PA-10003892 | Office Supplies | Paper | Xerox 1943 | 146.730 | 3 | 0.0 | 68.9631 |
5 rows × 21 columns
# Step3. 绘制散点图
import plotly.graph_objects as go
fig = go.Figure(go.Scatter(
x=data['Sales'],
y=data['Profit'],
mode='markers',
marker=dict(
color=data['Discount'], # 用颜色表示 Discount
size=12 ),
text=data['Discount'],
opacity=0.7
))
fig.update_layout(
title='Sales and Profit Distrubution of Paper',
xaxis=dict(title='Sales'),
yaxis=dict(title='Profit')
)
fig.show()
# 延伸;用气泡的颜色来展示不同地区(Region)的数据点
fig = go.Figure()
for i in data['Region'].unique():
fig.add_trace(go.Scatter(
x = data.loc[data['Region']==i,'Sales'],
y = data.loc[data['Region']==i,'Profit'],
mode='markers',
name=i,
marker_size=12,
opacity=0.7
))
fig.update_layout(
title='Sales and Profit Distrubution of Paper by Region',
xaxis=dict(title='Sales'),
yaxis=dict(title='Profit')
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制气泡图,展示销售额最高的前50名客户的销售额(X轴)和利润(Y轴)的关系,气泡的大小size和颜色color均体现折扣(Discount)这一变量,交互时增加显示的文本text:Customer Name和Discount(如交互所示)。
# Step1. 导入数据:导入文件'Sample - Superstore.xls',并查看数据
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
df.shape
(9994, 21)
# Step2. 数据分析得到销售额最高的前50名客户的销售额(Sales)、利润(Profit)和折扣(Discount)
data = df.groupby('Customer Name').agg({'Sales':'sum','Discount':'mean','Profit':'sum'})
data = data.sort_values(by='Sales',ascending=False)
data = data[:50]
data
| Sales | Discount | Profit | |
|---|---|---|---|
| Customer Name | |||
| Sean Miller | 25043.0500 | 0.246667 | -1980.7393 |
| Tamara Chand | 19052.2180 | 0.116667 | 8981.3239 |
| Raymond Buch | 15117.3390 | 0.094444 | 6976.0959 |
| Tom Ashbrook | 14595.6200 | 0.080000 | 4703.7883 |
| Adrian Barton | 14473.5710 | 0.240000 | 5444.8055 |
| Ken Lonsdale | 14175.2290 | 0.200000 | 806.8550 |
| Sanjit Chand | 14142.3340 | 0.063636 | 5757.4119 |
| Hunter Lopez | 12873.2980 | 0.018182 | 5622.4292 |
| Sanjit Engle | 12209.4380 | 0.110526 | 2650.6769 |
| Christopher Conant | 12129.0720 | 0.281818 | 2177.0493 |
| Todd Sumrall | 11891.7510 | 0.116667 | 2371.7144 |
| Greg Tran | 11820.1200 | 0.100000 | 2163.4269 |
| Becky Martin | 11789.6300 | 0.168750 | -1659.9581 |
| Seth Vernon | 11470.9500 | 0.156250 | 1199.4242 |
| Caroline Jumper | 11164.9740 | 0.188500 | 858.7414 |
| Clay Ludtke | 10880.5460 | 0.114286 | 1933.7831 |
| Maria Etezadi | 10663.7280 | 0.131818 | 1859.4695 |
| Karen Ferguson | 10604.2660 | 0.033333 | 1660.1386 |
| Bill Shonely | 10501.6530 | 0.011111 | 2616.0644 |
| Edward Hooks | 10310.8800 | 0.071875 | 1393.5154 |
| John Lee | 9799.9230 | 0.088235 | 228.9070 |
| Grant Thornton | 9351.2120 | 0.250000 | -4108.6589 |
| Helen Wasserman | 9300.2540 | 0.045000 | 2164.1611 |
| Tom Boeckenhauer | 9133.9900 | 0.070588 | 2798.3689 |
| Peter Fuller | 9062.8640 | 0.121053 | -614.2943 |
| Christopher Martinez | 8954.0200 | 0.120000 | 3899.8904 |
| Justin Deggeller | 8828.0305 | 0.055882 | 1619.5199 |
| Joe Elijah | 8697.8430 | 0.322727 | 1262.2926 |
| Laura Armstrong | 8673.2220 | 0.115385 | 2059.1199 |
| Pete Kriz | 8646.9340 | 0.076000 | 2038.2676 |
| Daniel Raglin | 8350.8680 | 0.153846 | 2869.0760 |
| Natalie Fritzler | 8322.8260 | 0.250000 | -1695.9714 |
| Karen Daniels | 8282.3580 | 0.187500 | 1107.6952 |
| Nick Crebassa | 8241.7390 | 0.136667 | 1314.7580 |
| Harry Marie | 8236.7648 | 0.231000 | 2437.9836 |
| Keith Dawkins | 8181.2560 | 0.087500 | 3038.6254 |
| Sean Braxton | 8057.8910 | 0.241176 | -2082.7451 |
| Zuschuss Carroll | 8025.7070 | 0.254839 | -1032.1490 |
| Joseph Holt | 7954.9980 | 0.085714 | -644.6982 |
| Nora Preis | 7903.1825 | 0.196154 | 631.2282 |
| Anna Häberlin | 7888.2940 | 0.217391 | 1298.0166 |
| Adam Bellavance | 7755.6200 | 0.044444 | 2054.5885 |
| Jim Epp | 7754.9760 | 0.160000 | 1623.4019 |
| Jane Waco | 7721.7140 | 0.071429 | 2173.7094 |
| Lena Creighton | 7663.1260 | 0.156522 | 1288.3469 |
| John Murray | 7625.0760 | 0.184615 | 1574.6164 |
| Jonathan Doherty | 7610.8640 | 0.075000 | 1050.2668 |
| Patrick O'Brill | 7473.8282 | 0.210000 | 38.4757 |
| Maribeth Schnelling | 7443.6900 | 0.160417 | 844.9355 |
| Rick Wilson | 7397.4010 | 0.134783 | 1586.6273 |
# Step3. 绘制气泡图
import plotly.graph_objects as go
text=[]
for i in range(50):
text.append(('Customer Name: {0}<br>Discount: {1:.2%}').format(data.index[i],data.iloc[i,1]))
fig = go.Figure(go.Scatter(
x=data['Sales'],
y=data['Profit'],
mode='markers',
marker=dict(
color=data['Discount'],
colorscale='viridis',
showscale=True,
size=data['Discount'],
sizeref=2*max(data['Discount'])/(10**2)),
text=text,
))
fig.update_layout(
title='Top-50 Customers\' Sales and Profit',
xaxis_title='Sales',
yaxis_title='Profit'
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每天的销售额(Sales)和利润(Profit)。
# 导入数据并进行数据分析
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
# 每天销售额和利润
data = df.groupby('Order Date')[['Sales','Profit']].sum()
data
| Sales | Profit | |
|---|---|---|
| Order Date | ||
| 2015-01-03 | 16.4480 | 5.5512 |
| 2015-01-04 | 288.0600 | -65.9901 |
| 2015-01-05 | 19.5360 | 4.8840 |
| 2015-01-06 | 4407.1000 | 1358.0524 |
| 2015-01-07 | 87.1580 | -71.9621 |
| ... | ... | ... |
| 2018-12-26 | 814.5940 | 61.1202 |
| 2018-12-27 | 177.6360 | -31.9742 |
| 2018-12-28 | 1657.3508 | 253.1188 |
| 2018-12-29 | 2915.5340 | 644.4338 |
| 2018-12-30 | 713.7900 | 101.5365 |
1236 rows × 2 columns
# 索引和切片
print(data.loc['2018'],'\n') # 2018年的记录
print(data.loc['2018-02'],'\n') # 2018年2月的记录
print(data['2018-12-01':'2018-12-15']) # 切片
Sales Profit
Order Date
2018-01-01 1481.8280 -181.4109
2018-01-02 2079.5540 -207.0473
2018-01-03 2070.2720 704.2800
2018-01-06 33.7400 15.5204
2018-01-07 3395.5900 758.7192
... ... ...
2018-12-26 814.5940 61.1202
2018-12-27 177.6360 -31.9742
2018-12-28 1657.3508 253.1188
2018-12-29 2915.5340 644.4338
2018-12-30 713.7900 101.5365
[322 rows x 2 columns]
Sales Profit
Order Date
2018-02-02 913.3540 170.6770
2018-02-03 922.3270 215.5700
2018-02-04 32.6700 8.4942
2018-02-05 2263.0120 74.8820
2018-02-06 904.3540 204.3158
2018-02-09 773.7640 -411.9726
2018-02-10 227.1030 28.1274
2018-02-11 1241.5160 130.1018
2018-02-13 1058.4300 424.3345
2018-02-16 1337.4420 95.9756
2018-02-17 2964.8174 -383.5478
2018-02-18 287.3260 62.4082
2018-02-19 1314.5900 377.0515
2018-02-20 1150.2900 -107.5121
2018-02-21 47.9040 -2.9940
2018-02-23 117.8000 42.3700
2018-02-24 1448.6760 249.3929
2018-02-25 430.4920 -19.3798
2018-02-26 2847.6460 447.3532
2018-02-28 17.6200 8.2242
Sales Profit
Order Date
2018-12-01 5331.178 718.8920
2018-12-02 9951.182 -7.3410
2018-12-03 1403.842 280.7407
2018-12-04 2639.638 -21.9881
2018-12-05 1453.136 447.6235
2018-12-06 10.680 2.8836
2018-12-07 2916.514 -2686.6673
2018-12-08 7643.041 1154.6045
2018-12-09 5470.390 1487.1418
2018-12-10 3873.559 715.5696
2018-12-11 2823.965 -82.4089
2018-12-13 580.936 99.2154
2018-12-14 3897.714 215.2500
2018-12-15 306.888 52.5946
# 绘制时间序列图:2018年每天的销售额(Sales)和利润(Profit)
import plotly.graph_objects as go
import pandas as pd
data = df.groupby('Order Date')[['Sales','Profit']].sum()
data = data.loc['2018']
fig = go.Figure()
fig.add_trace(go.Scatter(
x=data.index,
y=data['Sales'],
name='Sales'
))
fig.add_trace(go.Scatter(
x=data.index,
y=data['Profit'],
name='Profit'
))
fig.update_traces(opacity=0.8)
fig.update_layout(
title='Sales and Profit in 2018'
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每个月的销售额(Sales)和利润(Profit)。
对Order Date进行groupby操作后,时间戳是每天(D),如果想要将其转换为每月(M),可以通过重新采样来实现。重新采样是指将时间序列从一个频率转换为另一个频率的过程。将更高频率的数据聚合到低频率被称为向下采样,反之则称为向上采样。Pandas对象配有resample方法,与groupby方法类似,调用resample时需要对数据分组,之后再调用聚合函数。
# 重新采样
data = df.groupby('Order Date')[['Sales','Profit']].sum()
data = data.loc['2018'].resample('M').sum()
data
| Sales | Profit | |
|---|---|---|
| Order Date | ||
| 2018-01-31 | 43971.3740 | 7140.4391 |
| 2018-02-28 | 20301.1334 | 1613.8720 |
| 2018-03-31 | 58872.3528 | 14751.8915 |
| 2018-04-30 | 36521.5361 | 933.2900 |
| 2018-05-31 | 44261.1102 | 6342.5828 |
| 2018-06-30 | 52981.7257 | 8223.3357 |
| 2018-07-31 | 45264.4160 | 6952.6212 |
| 2018-08-31 | 63120.8880 | 9040.9557 |
| 2018-09-30 | 87866.6520 | 10991.5556 |
| 2018-10-31 | 77776.9232 | 9275.2755 |
| 2018-11-30 | 118447.8250 | 9690.1037 |
| 2018-12-31 | 83829.3188 | 8483.3468 |
data.index.strftime('%Y-%m') # 转换时间格式
Index(['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
'2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12'],
dtype='object', name='Order Date')
# 绘制时间序列图:2018年每个月的销售额(Sales)和利润(Profit)
fig = go.Figure()
fig.add_trace(go.Scatter(
x=data.index.strftime('%Y-%m'),
y=data['Sales'],
name='Sales',
marker_color='#ff7043'
))
fig.add_trace(go.Scatter(
x=data.index.strftime('%Y-%m'),
y=data['Profit'],
name='Profit',
marker_color='#29b6f6'
))
fig.update_layout(
title='Sales and Profit in 2018',
xaxis=dict(dtick='M1') # X轴刻度显示为每一个月
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制饼图,展示每个地区(Region)销售额(Sales)总和的占比情况。
# 数据分析:得到每个地区的销售额总和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
data = df.groupby('Region')['Sales'].sum()
data
Region Central 501239.8908 East 678781.2400 South 391721.9050 West 725457.8245 Name: Sales, dtype: float64
# 绘制饼图,并在饼图上显示地区和占比
import plotly.graph_objects as go
fig = go.Figure(data = go.Pie(
labels = data.index,
values = data.values,
textinfo = 'label+percent',
opacity = 0.9
))
fig.update_layout(
title='Sales by Region',
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制两个饼图,分别展示2015年和2018年每个地区(Region)销售额(Sales)总和的占比情况。
# 数据分析:得到2015年和2018年每个地区的销售额总和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
data1 = df[df['Order Date'].array.year==2015]
data1 = data1.groupby('Region')['Sales'].sum()
data2 = df[df['Order Date'].array.year==2018]
data2 = data2.groupby('Region')['Sales'].sum()
# 将两组数据进行合并,赋值给data变量
data = pd.merge(data1,data2,left_index = True,right_index = True, suffixes = ('_2015','_2018'))
data
| Sales_2015 | Sales_2018 | |
|---|---|---|
| Region | ||
| Central | 103838.1646 | 147098.1282 |
| East | 128680.4570 | 213082.9040 |
| South | 103845.8435 | 122905.8575 |
| West | 147883.0330 | 250128.3655 |
# 绘制多个饼图 Pie Charts in subplots
# 方法1. 使用domain参数设置每个Pie的区域
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Pie(
labels = data.index,
values = data['Sales_2015'],
name = 'Sales in 2015',
domain = dict(x = [0,0.5]) # 设置domain参数
))
fig.add_trace(go.Pie(
labels = data.index,
values = data['Sales_2018'],
name = 'Sales in 2018',
domain = dict(x = [0.5,1]) # 设置domain参数
))
fig.update_traces(
opacity=0.9,
hole=0.4,
# scalegroup 如果多个trace属于同一个scalegroup,将会根据总数值来显示面积比例
scalegroup='one'
)
fig.update_layout(title = 'Sales by Region')
fig.show()
# 方法2. 使用make_subplots方法创建多子图
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# 使用make_subplots方法创建多子图
fig = make_subplots(rows = 1, cols = 2, # 设置行数和列数
specs = [[{'type':'domain'}, {'type':'domain'}]], # 设置每个子图的类型,当绘制饼图时类型为domain
subplot_titles=['Sales in 2015','Sales in 2018']) # 设置每个子图的标题
fig.add_trace(trace = go.Pie( labels = data.index,
values = data['Sales_2015']),
row = 1, col = 1 ) # 设置该子图的位置
fig.add_trace(trace = go.Pie( labels = data.index,
values = data['Sales_2018']),
row = 1, col = 2 ) # 设置该子图的位置
fig.update_traces(
opacity=0.9,
hole=0.4,
# scalegroup 如果多个trace属于同一个scalegroup,将会根据总数值来显示面积比例
scalegroup='two'
)
fig.update_layout(title = 'Sales by Region')
fig.show()
导入数据文件'Sample - Superstore.xls',绘制旭日图,展示商品类别(Category)和商品子类别(Sub-Category)之间的层级关系,环块的宽度体现销售额(Sales)总和。
# 数据分析:得到每个类别-子类别的销售额总和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
# 先对'Category'-'Sub-Category'进行分组,计算每组'Sales'总和
data = df.groupby(['Category', 'Sub-Category'])['Sales'].sum()
data # 得到一个Multi-Index的Series
Category Sub-Category
Furniture Bookcases 114879.9963
Chairs 328449.1030
Furnishings 91705.1640
Tables 206965.5320
Office Supplies Appliances 107532.1610
Art 27118.7920
Binders 203412.7330
Envelopes 16476.4020
Fasteners 3024.2800
Labels 12486.3120
Paper 78479.2060
Storage 223843.6080
Supplies 46673.5380
Technology Accessories 167380.3180
Copiers 149528.0300
Machines 189238.6310
Phones 330007.0540
Name: Sales, dtype: float64
# 绘制的旭日图有两层,内层是Category(父节点3个),外层是Sub-Category(子节点)
labels = list(data.index.levels[0]) # labels列表包括所有节点标签,初值为3个父节点
parents = ['']*3 # parents列表初值为3个根节点,即空字符串
for (i1,i2) in data.index: # 遍历data的多层索引
labels.append(i2) # 通过循环向labels列表添加子节点
parents.append(i1) # 通过循环向parents列表添加子节点的父节点,即多个重复的Category
print('labels:',labels)
print('\nparents:',parents)
labels: ['Furniture', 'Office Supplies', 'Technology', 'Bookcases', 'Chairs', 'Furnishings', 'Tables', 'Appliances', 'Art', 'Binders', 'Envelopes', 'Fasteners', 'Labels', 'Paper', 'Storage', 'Supplies', 'Accessories', 'Copiers', 'Machines', 'Phones'] parents: ['', '', '', 'Furniture', 'Furniture', 'Furniture', 'Furniture', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Technology', 'Technology', 'Technology', 'Technology']
# valuaes列表是每个labels对应的数值,因此是3个Category的Sales总和 + 每个Sub-Category的Sales总和
values = list(data.groupby(level='Category').sum())+list(data.values)
print('values:',values)
values: [741999.7953, 719047.032, 836154.033, 114879.9963, 328449.103, 91705.164, 206965.532, 107532.161, 27118.792, 203412.733, 16476.402000000002, 3024.28, 12486.312, 78479.206, 223843.608, 46673.538, 167380.318, 149528.03, 189238.631, 330007.054]
# 绘制旭日图
import plotly.graph_objects as go
fig = go.Figure(data = go.Sunburst(
labels = labels,
parents = parents,
values = values,
branchvalues='total',
# 可选参数包括['label', 'text', 'value', 'current path', 'percent root', 'percent entry', 'percent parent']
textinfo ='label + percent parent' # 显示标签和所占父节点的百分比
))
fig.update_layout(
title='Sales by Sub-Category',
margin = dict(l=0, r=0, b=0)
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制层级图,展示商品类别(Category)和商品子类别(Sub-Category)之间的层级关系,矩形版块的面积大小体现销售额(Sales)总和。
# 数据分析:得到每个类别-子类别的销售额总和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
# 先对'Category'-'Sub-Category'进行分组,计算每组'Sales'总和
data = df.groupby(['Category', 'Sub-Category'])['Sales'].sum()
data # 得到一个Multi-Index的Series
Category Sub-Category
Furniture Bookcases 114879.9963
Chairs 328449.1030
Furnishings 91705.1640
Tables 206965.5320
Office Supplies Appliances 107532.1610
Art 27118.7920
Binders 203412.7330
Envelopes 16476.4020
Fasteners 3024.2800
Labels 12486.3120
Paper 78479.2060
Storage 223843.6080
Supplies 46673.5380
Technology Accessories 167380.3180
Copiers 149528.0300
Machines 189238.6310
Phones 330007.0540
Name: Sales, dtype: float64
# 该层级图有两层,外部矩形代表父类别(Category),内部矩形代表子类别(Sub-Category)
labels = list(data.index.levels[0]) # labels列表包括所有节点标签,初值为3个父节点
parents = ['']*3 # parents列表初值为3个根节点,即空字符串
for (i,v) in data.index: # 遍历data的多层索引
labels.append(v) # 通过循环向labels列表添加子节点
parents.append(i) # 通过循环向parents列表添加子节点对应的父节点,即多个重复的Category
print('labels:',labels)
print('\nparents:',parents)
labels: ['Furniture', 'Office Supplies', 'Technology', 'Bookcases', 'Chairs', 'Furnishings', 'Tables', 'Appliances', 'Art', 'Binders', 'Envelopes', 'Fasteners', 'Labels', 'Paper', 'Storage', 'Supplies', 'Accessories', 'Copiers', 'Machines', 'Phones'] parents: ['', '', '', 'Furniture', 'Furniture', 'Furniture', 'Furniture', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Office Supplies', 'Technology', 'Technology', 'Technology', 'Technology']
# values列表是每个labels对应的数值,因此是3个Category的Sales总和 连接(+) 每个Sub-Category的Sales总和
values = list(data.groupby(level='Category').sum())+list(data.values)
print('values:',values)
values: [741999.7953, 719047.032, 836154.033, 114879.9963, 328449.103, 91705.164, 206965.532, 107532.161, 27118.792, 203412.733, 16476.402000000002, 3024.28, 12486.312, 78479.206, 223843.608, 46673.538, 167380.318, 149528.03, 189238.631, 330007.054]
# 绘制层级图:将三个不同的Category设置成三种离散的颜色
fig = go.Figure(go.Treemap(
labels = labels,
parents = parents,
values = values,
branchvalues='total',
textinfo ='label + value + percent parent', # 显示标签、值以及所占父节点的百分比
root_color = 'white'
))
fig.update_layout(
title='Sales by Sub-Category',
treemapcolorway = ['lightblue','lightgrey','pink']
)
fig.show()
# 绘制层级图:将每个标签所在板块的颜色随着销量的值深浅变化
fig = go.Figure(go.Treemap(
labels = labels,
parents = parents,
values = values,
branchvalues='total',
textinfo ='label + value + percent parent', # 显示标签、值以及所占父节点的百分比
marker = dict(colors = values, colorscale = 'Greens',showscale = True),
))
fig.update_layout(
title='Sales by Sub-Category',
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制层级图,展示地区(Region)和州(States)之间的层级,矩形板块的面积体现销售额(Sales)总和,矩形板块的颜色体现利润(Profit)总和。其中文本显示标签和销量全国百分比,交互时显示值和区域占比。
import numpy as np
import plotly.graph_objects as go
# 先对'Region'-'State'进行分组,计算每组'Sales'总和
data = df.groupby(['Region','State'])[['Sales','Profit']].sum()
# 绘制的旭日图有两层,内层是Region(父节点4个),外层是State(子节点)
labels = list(data.index.levels[0]) # labels列表包括所有节点标签,初值为4个父节点
parents = ['']*4 # len(data.index.levels[0]) # parents列表初值为4个根节点,即空字符串
for (i1,i2) in data.index: # 遍历data的多层索引
labels.append(i2) # 通过循环向labels列表添加子节点
parents.append(i1) # 通过循环向parents列表添加子节点的父节点,即多个重复的Region
# valuaes列表是每个labels对应的Sales总和,因此是4个Region的Sales+每个States的Sales
values = list(data['Sales'].groupby(level='Region').sum()) + list(data['Sales'].values)
# colors列表是每个labels对应的Profit总和,因此是4个Region的Profit+每个States的Profit
colors = list(data['Profit'].groupby(level='Region').sum()) + list(data['Profit'].values)
# text列表用于交互时显示的文本,同colors列表,也可使用for循环写成自定义样式
text = []
for i in range(len(labels)):
text.append(('Profit:{0}').format(np.round(colors[i],2)))
fig = go.Figure(data = go.Treemap(
labels = labels,
parents = parents,
values = values,
branchvalues = 'total',
marker = dict(colors = colors,
colorscale = [[0,'#f57c00'],[0.2,'white'],[1,'#1565c0']],
showscale = True),
# 可选参数包括['label', 'text', 'value', 'current path', 'percent root', 'percent entry', 'percent parent']
text=text,
hoverinfo ='value + percent parent + text', # 交互时显示值和所占父节点的百分比
textinfo ='label + percent root', # 文本显示标签和所占根节点百分比
))
fig.update_layout(
title = 'Sales and Profit by Region & States',
margin = dict(l=0, r=0, b=0)
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制直方图,展示每笔订单的利润(Profit)的分布情况。
# 数据分析:得到每笔订单的利润之和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
data = df.groupby('Order ID')['Profit'].sum()
data # 共有5009个订单,远小于记录数
Order ID
CA-2015-100006 109.6113
CA-2015-100090 -19.0890
CA-2015-100293 31.8696
CA-2015-100328 1.3257
CA-2015-100363 7.7192
...
US-2018-168802 5.9696
US-2018-169320 16.6721
US-2018-169488 26.5552
US-2018-169502 32.4527
US-2018-169551 -62.2895
Name: Profit, Length: 5009, dtype: float64
# 绘制直方图
import plotly.graph_objects as go
fig = go.Figure(go.Histogram(
x = data.values,
xbins = dict(
start=-500,
end=500,
size=50
),
marker_color='#EB89B5',
opacity=0.8
))
fig.update_layout(
title='Distribution of Orders\' Profit',
xaxis=dict(title='Profit'),
yaxis=dict(title='Count')
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制重叠直方图,比较产品子类别(Sub-Category)为'Chairs'和'Tables'利润(Profit)的分布情况。
# 数据分析:分别得到产品子类别(Sub-Category)为'Chairs'和'Tables'的每一条记录的利润(Profit),不做聚合
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
data1 = df.loc[df['Sub-Category']=='Chairs','Profit']
data2 = df.loc[df['Sub-Category']=='Tables','Profit']
print('Number of \'Chairs\' records: ', data1.shape)
print('Number of \'Tables\' records: ', data2.shape)
Number of 'Chairs' records: (617,) Number of 'Tables' records: (319,)
# 绘制重叠直方图
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Histogram(
x = data1,
name = 'Chairs'
))
fig.add_trace(go.Histogram(
x = data2,
name = 'Tables'
))
fig.update_layout(
barmode='overlay',
title='Profit Distribution of Chairs and Tables',
xaxis=dict(title='Profit'),
yaxis=dict(title='Percent')
)
fig.update_traces(
histnorm='probability',
opacity=0.7,
xbins=dict(
start=-800,
end=800,
size=50
),
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制分布图,展示2018年不同用户细分(Segment)每笔订单的利润(Profit)分布情况。
# 数据分析:得到不同用户细分(Segment)中,每笔订单的利润(Profit)之和
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
data = df[df['Order Date'].array.year == 2018]
data = data.groupby(['Segment','Order ID'])['Profit'].sum()
data
Segment Order ID
Consumer CA-2018-100013 12.2162
CA-2018-100097 302.4705
CA-2018-100111 1571.7964
CA-2018-100160 16.9641
CA-2018-100230 42.3643
...
Home Office US-2018-162558 -1034.8361
US-2018-163790 282.4405
US-2018-166037 157.4685
US-2018-166324 -52.0700
US-2018-169502 32.4527
Name: Profit, Length: 1687, dtype: float64
# 绘制分布图
import plotly.figure_factory as ff
hist_data = []
group_labels = []
colors = ['#393E46', '#2BCDC1', '#F66095'] # 自定义颜色列表
for i in data.index.levels[0]:
hist_data.append(data[i])
group_labels.append(i)
fig = ff.create_distplot(
hist_data,
group_labels,
bin_size = 10,
colors = colors,
show_curve=False,
)
fig.update_layout(
title = 'Profit Distribution by Segment in 2018',
xaxis = dict(range=[-200,200],title='Profit'),
yaxis = dict(title='Probability Density')
)
fig.show()